Introduction

The aim is to combine label transfer and CNV inference to annotate Wilms tumor samples in SCPCP000006. The proposed annotation will be based on the combination of:

  • the label transfer from the fetal kidney reference (Stewart et al.), in particular the fetal_kidney_predicted.compartment and fetal_kidney:predicted.cell_type, as well as the prediction.score for each compartment,

  • the predicted CNV calculated using intra-sample endothelial and immune cells (--reference both) as normal reference; no reference was used for samples with fewer than 3 predicted normal cells.

In a second time, we will explore and validate the chosen annotation.

We will use some of the markers genes to validate visually the annotations.

The analysis can be summarized as the following:

Where cnv.thr and pred.thr need to be discussed

first level annotation second level annotation selection of the cells marker genes for validation CNV validation
normal endothelial compartment == "endothelium" & predicted.score > pred.thr & cnv_score < cnv.thr VWF no CNV
normal immune compartment == "immune" & predicted.score > pred.thr & cnv_score < cnv.thr PTPRC, CD163, CD68 no CNV
normal kidney cell_type %in% c("kidney cell","kidney epithelial", "podocyte") & predicted.score > pred.thr & cnv_score < cnv.thr CDH1, PODXL, LTL no CNV
normal stroma compartment == "stroma" & predicted.score > pred.thr & cnv_score < cnv.thr VIM no CNV
cancer stroma compartment == "stroma" & cnv_score > cnv.thr VIM proportion_cnv_chr: 1, 4, 11, 16, 17, 18
cancer blastema compartment == "fetal_nephron" & cell_type == "mesenchymal cell" & cnv_score > cnv.thr CITED1 proportion_cnv_chr: 1, 4, 11, 16, 17, 18
cancer epithelial compartment == "fetal_nephron" & cell_type != "mesenchymal cell" & cnv_score > cnv.thr CDH1 proportion_cnv_chr: 1, 4, 11, 16, 17, 18
unknown - the rest of the cells - -

Packages

library(Seurat)
library(tidyverse)
library(patchwork)
library(DT)

Base directories

# The base path for the OpenScPCA repository, found by its (hidden) .git directory
repository_base <- rprojroot::find_root(rprojroot::is_git_root)

# The current data directory, found within the repository base directory
data_dir <- file.path(repository_base, "data", "current", "SCPCP000006")

# The path to this module
module_base <- file.path(repository_base, "analyses", "cell-type-wilms-tumor-06")

result_dir <- file.path(module_base, "results")

Output files

# cell type annotation table
annotations_tsv <- file.path(result_dir, "SCPCP000006-annotations.tsv")

Input files

In this notebook, we are working with all of the samples in SCPCP000006.

The sample metadata can be found in sample_metadata_file in the data folder.

We extracted from the pre-processed and labeled Seurat object from results/{sample_id}/06_infercnv_HMM-i3_{sample_id}_reference-{reference}.rds. the following information per cell:

  • the predicted compartment in fetal_kidney_predicted.compartment and fetal_kidney_predicted.compartment.score

  • the predicted compartment in fetal_kidney_predicted.cell_type and fetal_kidney_predicted.cell_type.score

  • the sample identifier in sample_id

  • the proportion of estimated CNV per chromosome, {i} in 1 to 22 proportion_cnv_chr{i}

  • the counts of markers genes

sample_metadata_file <- file.path(repository_base, "data", "current", "SCPCP000006", "single_cell_metadata.tsv")
metadata <- read.table(sample_metadata_file, sep = "\t", header = TRUE)


sample_ids <- metadata |>
  dplyr::filter(seq_unit != "spot") |>
  dplyr::pull(scpca_sample_id) |>
  unique()

# These samples were run with "none" as the reference
none_reference_samples <- c("SCPCS000177", "SCPCS000180", "SCPCS000181", "SCPCS000190", "SCPCS000197")

# Create a data frames of all annotations
cell_type_df <- sample_ids |>
  purrr::map(
    # For each sample_id, do the following:
    \(sample_id) {
      if (sample_id %in% none_reference_samples) {
        reference <- "none"
      } else {
        reference <- "both"
      }

      input_file <- file.path(
        result_dir,
        sample_id,
        glue::glue("06_infercnv_HMM-i3_{sample_id}_reference-{reference}.rds")
      )

      # The file may not be present if this is being run in CI, which is ok.
      # If we are not running in CI and the file doesn't exist, we should error out
      # We should error out if the file does not exist and we are NOT testing
      if (!file.exists(input_file)) {
        if (params$testing) {
          return(NULL)
        } else {
          stop("Input RDS file does not exist.")
        }
      }

      # Read in the Seurat object
      srat <- readRDS(input_file)

      # Create and return a data frame from the Seurat object with relevant annotations
      # this data frame will have four columns: barcode, sample_id, compartment, organ
      data.frame(
        # label transfer from the fetal kidney reference
        cell_type = srat$fetal_kidney_predicted.cell_type,
        compartment = srat$fetal_kidney_predicted.compartment,

        # predicted.scores from the label transfer from the fetal kidney reference
        cell_type.score = srat$fetal_kidney_predicted.cell_type.score,
        compartment.score = srat$fetal_kidney_predicted.compartment.score,

        # cell embedding
        umap = srat@reductions$umap@cell.embeddings,

        # marker genes
        PTPRC = FetchData(object = srat, vars = "ENSG00000081237", layer = "counts"),
        VWF = FetchData(object = srat, vars = "ENSG00000110799", layer = "counts"),
        VIM = FetchData(object = srat, vars = "ENSG00000026025", layer = "counts"),
        CITED1 = FetchData(object = srat, vars = "ENSG00000125931", layer = "counts"),
        CDH1 = FetchData(object = srat, vars = "ENSG00000039068", layer = "counts"),
        PODXL = FetchData(object = srat, vars = "ENSG00000128567", layer = "counts"),
        COL6A3 = FetchData(object = srat, vars = "ENSG00000163359", layer = "counts"),
        SIX2 = FetchData(object = srat, vars = "ENSG00000170577", layer = "counts"),
        NCAM1 = FetchData(object = srat, vars = "ENSG00000149294", layer = "counts"),
        THY1 = FetchData(object = srat, vars = "ENSG00000154096", layer = "counts"),

        # proportion of cnv per chromosome
        proportion_cnv_chr1 = srat$proportion_cnv_chr1,
        proportion_cnv_chr2 = srat$proportion_cnv_chr2,
        proportion_cnv_chr3 = srat$proportion_cnv_chr3,
        proportion_cnv_chr4 = srat$proportion_cnv_chr4,
        proportion_cnv_chr5 = srat$proportion_cnv_chr5,
        proportion_cnv_chr6 = srat$proportion_cnv_chr6,
        proportion_cnv_chr7 = srat$proportion_cnv_chr7,
        proportion_cnv_chr8 = srat$proportion_cnv_chr8,
        proportion_cnv_chr9 = srat$proportion_cnv_chr9,
        proportion_cnv_chr10 = srat$proportion_cnv_chr10,
        proportion_cnv_chr11 = srat$proportion_cnv_chr11,
        proportion_cnv_chr12 = srat$proportion_cnv_chr12,
        proportion_cnv_chr13 = srat$proportion_cnv_chr13,
        proportion_cnv_chr14 = srat$proportion_cnv_chr14,
        proportion_cnv_chr15 = srat$proportion_cnv_chr15,
        proportion_cnv_chr16 = srat$proportion_cnv_chr16,
        proportion_cnv_chr17 = srat$proportion_cnv_chr17,
        proportion_cnv_chr18 = srat$proportion_cnv_chr18,
        proportion_cnv_chr19 = srat$proportion_cnv_chr19,
        proportion_cnv_chr20 = srat$proportion_cnv_chr20,
        proportion_cnv_chr21 = srat$proportion_cnv_chr21,
        proportion_cnv_chr22 = srat$proportion_cnv_chr22,

        # cnv global estimation per chromosome
        has_cnv_chr1 = srat$has_cnv_chr1,
        has_cnv_chr2 = srat$has_cnv_chr2,
        has_cnv_chr3 = srat$has_cnv_chr3,
        has_cnv_chr4 = srat$has_cnv_chr4,
        has_cnv_chr5 = srat$has_cnv_chr5,
        has_cnv_chr6 = srat$has_cnv_chr6,
        has_cnv_chr7 = srat$has_cnv_chr7,
        has_cnv_chr8 = srat$has_cnv_chr8,
        has_cnv_chr9 = srat$has_cnv_chr9,
        has_cnv_chr10 = srat$has_cnv_chr10,
        has_cnv_chr11 = srat$has_cnv_chr11,
        has_cnv_chr12 = srat$has_cnv_chr12,
        has_cnv_chr13 = srat$has_cnv_chr13,
        has_cnv_chr14 = srat$has_cnv_chr14,
        has_cnv_chr15 = srat$has_cnv_chr15,
        has_cnv_chr16 = srat$has_cnv_chr16,
        has_cnv_chr17 = srat$has_cnv_chr17,
        has_cnv_chr18 = srat$has_cnv_chr18,
        has_cnv_chr19 = srat$has_cnv_chr19,
        has_cnv_chr20 = srat$has_cnv_chr20,
        has_cnv_chr21 = srat$has_cnv_chr21,
        has_cnv_chr22 = srat$has_cnv_chr22
      ) |>
        tibble::rownames_to_column("barcode") |>
        dplyr::mutate(sample_id = sample_id)
    }
  ) |>
  # now combine all dataframes to make one big one
  dplyr::bind_rows()

Output file

The report will be saved in the notebook directory.

Functions

do_Feature_mean

do_Feature_mean shows heatmap of mean expression of a feature grouped by a metadata.

  • df is the name of the table containing metadata and feature expression (counts) per cells
  • group.by is the name of the metadata to group the cells
  • feature is the name of the gene to average the expression
do_Feature_mean <- function(df, group.by, feature) {
  df <- df %>%
    group_by(sample_id, !!sym(group.by)) %>%
    summarise(m = mean(!!sym(feature)))


  p <- ggplot(df, aes(x = sample_id, y = !!sym(group.by), fill = m)) +
    geom_tile() +
    scale_fill_viridis_c() +
    theme_bw() +
    theme(text = element_text(size = 20)) +
    theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5)) +
    guides(fill = guide_colourbar(title = paste0(feature)))

  return(p)
}

Analysis

Global CNV score

As done in 06_cnv_infercnv_exploration.Rmd, we calculate single CNV score and assess its potential in identifying cells with CNV versus normal cells without CNV.

We simply checked for each chromosome if the cell has_cnv_chr. Would the cell have more than cnv_threshold chromosome with CNV, the global has_cnv_score will be TRUE. Else, the cell will have a has_cnv_score set to FALSE.

cell_type_df <- cell_type_df |>
  mutate(has_cnv_score = rowSums(cell_type_df[, grepl("has_cnv_chr", colnames(cell_type_df))])) |>
  mutate(has_cnv_score = case_when(
    has_cnv_score > params$cnv_threshold ~ "CNV",
    has_cnv_score <= params$cnv_threshold ~ "no CNV"
  ))


table(cell_type_df$has_cnv_score)

CNV no CNV 187292 13298

First level annotation

At first, we like to indicate in the first.level_annotation if a cell is normal, cancer or unknown.

  • normal cells can be observe in all four compartments (endothelium, immune, stroma or fetal nephron) and do not have CNV We only allow a bit of flexibility in terms of CNV profile for immune and endothelium cells that have a high predicted score. Indeed, we know that false positive CNV can be observed in a cell type specific manner.

The threshold used for the predicted.score is defined as a parameter of this notebook as 0.85. The threshold used for the identification of CNV is also defined as the notebook parameter 0.

  • cancer cells are either from the stroma or fetal nephron compartments and must have at least few CNV
# Define normal cells
# We first pick up the immune and endothelial cells annotated via the label transfer compartments under the condition that the predicted score is above the threshold
cell_type_df <- cell_type_df |>
  mutate(first.level_annotation = case_when(
    # assign normal/cancer based on condition
    compartment %in% c("fetal_nephron", "stroma") & has_cnv_score == "no CNV" ~ "normal",
    compartment %in% c("endothelium", "immune") & cell_type.score > params$predicted.celltype.threshold ~ "normal",
    compartment %in% c("fetal_nephron", "stroma") & has_cnv_score == "CNV" ~ "cancer",
    .default = "unknown"
  ))

Using this basic strategy, we identified 186282 cancer cells, 10896 normal cells. Only 3412cells remain unknown

ggplot(cell_type_df, aes(x = umap.umap_1, y = umap.umap_2, color = first.level_annotation), shape = 19, size = 1) +
  geom_point() +
  facet_wrap(facets = ~sample_id, ncol = 5) +
  theme_bw() +
  theme(text = element_text(size = 22))

Second level annotation

Normal cells

  • Normal cells from the fetal nephron compartment must be normal kidney cells.

  • Normal cells from the stroma compartment must be normal stroma cells.

  • Immune and endothelial cells have been already identified by label transfer.

Cancer cells

Wilms tumor cancer cells can be:

  • cancer stroma: We define as cancer stroma all cancer cells from the stroma compartment.

  • blastema,: we defined as blastema every cancer cell that has a fetal_kidney_predicted.cell_type == mesenchymal cell. We know that these mesenchymal cells are cells from the cap mesenchyme that are not expected to be in a mature kidney. These blastema cells should express higher CITED1.

  • cancer epithelium: we defined as cancer epithelium all cancer cells that are neither stroma nor blastemal cells. We expect these cells to express epithelial markers. Their predicted cell type should correspond to more mature kidney epithelial subunits.

cell_type_df <- cell_type_df |>
  mutate(second.level_annotation = case_when(
    # assign normal cells based on condition
    cell_type_df$compartment %in% c("fetal_nephron") &
      cell_type_df$has_cnv_score == "no CNV" ~ "kidney",
    cell_type_df$compartment %in% c("stroma") &
      cell_type_df$has_cnv_score == "no CNV" ~ "normal stroma",
    cell_type_df$compartment %in% c("endothelium") &
      (cell_type_df$compartment.score > params$predicted.celltype.threshold |
        cell_type_df$has_cnv_score == "no CNV") ~ "endothelium",
    cell_type_df$compartment %in% c("immune") &
      (cell_type_df$compartment.score > params$predicted.celltype.threshold |
        cell_type_df$has_cnv_score == "no CNV") ~ "immune",

    # assign cancer cells based on condition
    cell_type_df$compartment %in% c("stroma") &
      cell_type_df$has_cnv_score == "CNV" ~ "cancer stroma",
    cell_type_df$compartment %in% c("fetal_nephron") &
      cell_type_df$has_cnv_score == "CNV" &
      cell_type_df$cell_type == "mesenchymal cell" ~ "blastema",
    cell_type_df$compartment %in% c("fetal_nephron") &
      cell_type_df$has_cnv_score == "CNV" &
      cell_type_df$cell_type != "mesenchymal cell" ~ "cancer epithelium",
    .default = "unknown"
  ))
ggplot(cell_type_df[cell_type_df$first.level_annotation == "normal", ], aes(x = umap.umap_1, y = umap.umap_2, color = second.level_annotation), shape = 19, size = 1) +
  geom_point() +
  facet_wrap(facets = ~sample_id, ncol = 5) +
  theme_bw() +
  theme(text = element_text(size = 22))

ggplot(cell_type_df[cell_type_df$first.level_annotation == "cancer", ], aes(x = umap.umap_1, y = umap.umap_2, color = second.level_annotation), shape = 19, size = 0.1) +
  geom_point() +
  facet_wrap(facets = ~sample_id, ncol = 5) +
  theme_bw() +
  theme(text = element_text(size = 22))

Cancer and normal cells

ggplot(cell_type_df, aes(x = umap.umap_1, y = umap.umap_2, color = second.level_annotation), shape = 19, size = 1) +
  geom_point() +
  facet_wrap(facets = ~sample_id, ncol = 5) +
  theme_bw() +
  theme(text = element_text(size = 22))

Validation cancer versus normal based on the CNV profile

for (i in 1:22) {
  print(do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = glue::glue("proportion_cnv_chr", i)))
}

Validation of second level annotation using marker genes

Immune, PTPRC expression

do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000081237")

Endothelium, VWF expression

do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000110799")

Stroma, Vimentin expression

do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000026025")

Stroma, COL6A3 expression

do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000163359")

Stroma, THY1 expression

do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000154096")

Blastema, CITED1 expression

do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000125931")

Blastema, NCAM1 expression

do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000149294")

stemness marker (blastema and primitive epithelium), SIX2 expression

do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000170577")

Epithelium, CDH1 expression

do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000039068")

Epithelium, PODXL expression

do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = "ENSG00000128567")

Create annotation table for export

This section creates the cell type annotation table for export.

annotations_table <- cell_type_df |>
  select(
    cell_barcode = barcode,
    scpca_sample_id = sample_id,
    tumor_cell_classification = first.level_annotation,
    cell_type_assignment = second.level_annotation
  ) |>
  mutate(
    # change cancer --> tumor, but keep the other labels
    tumor_cell_classification = ifelse(
      tumor_cell_classification == "cancer", "tumor", tumor_cell_classification
    ),
    cell_type_assignment = str_replace_all(
      cell_type_assignment,
      "cancer ",
      "tumor "
    )
  )

write_tsv(annotations_table, annotations_tsv)

Confirm how many samples we have annotations for:

length(unique(annotations_table$scpca_sample_id))
## [1] 40

Conclusion

  • Combining label transfer and CNV inference we have produced draft annotations for all 40 Wilms tumor samples in SCPCP000006

  • The heatmaps of CNV proportion and marker genes support our annotations, but signals with some marker genes are very low. Also, there is no universal marker for each entity of Wilms tumor that cover all tumor cells from all patient. This makes the validation of the annotations quite difficult.

  • However, we could try to take the problem from the other side, and used the current annotation to perform differential expression analysis and try to find marker genes that are consistent across patient and Wilms tumor histologies.

  • In each histology (i.e. epithelial and stroma), the distinction between cancer and non cancer cell is difficult (as expected). In this analysis, we suggested to rely on the CNV score to assess the normality of the cell. Here again, we could try to run differential expression analysis and compare epithelial (resp. stroma) cancer versus non-cancer cells across patient, aiming to find a share transcriptional program allowing the classification cancer versus normal.

  • In our annotation, we haven’t taken into account the favorable/anaplastic status of the sample. However, as anaplasia can occur in every (but do not has to) Wilms tumor histology, I am not sure how to integrate the information into the annotation.

  • This notebook could be finally rendered using different parameters, i.e. threshold for the CNV score and predicted score to use.

Session Info

# record the versions of the packages used in this analysis and other environment information
sessionInfo()
## R version 4.4.0 (2024-04-24)
## Platform: aarch64-apple-darwin20
## Running under: macOS 15.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] DT_0.33            patchwork_1.2.0    lubridate_1.9.3    forcats_1.0.0     
##  [5] stringr_1.5.1      dplyr_1.1.4        purrr_1.0.2        readr_2.1.5       
##  [9] tidyr_1.3.1        tibble_3.2.1       ggplot2_3.5.1      tidyverse_2.0.0   
## [13] Seurat_5.1.0       SeuratObject_5.0.2 sp_2.1-4          
## 
## loaded via a namespace (and not attached):
##   [1] RColorBrewer_1.1-3     rstudioapi_0.16.0      jsonlite_1.8.8        
##   [4] magrittr_2.0.3         spatstat.utils_3.1-0   farver_2.1.2          
##   [7] rmarkdown_2.28         vctrs_0.6.5            ROCR_1.0-11           
##  [10] spatstat.explore_3.3-2 htmltools_0.5.8.1      sass_0.4.9            
##  [13] sctransform_0.4.1      parallelly_1.38.0      KernSmooth_2.23-24    
##  [16] bslib_0.8.0            htmlwidgets_1.6.4      ica_1.0-3             
##  [19] plyr_1.8.9             plotly_4.10.4          zoo_1.8-12            
##  [22] cachem_1.1.0           igraph_2.0.3           mime_0.12             
##  [25] lifecycle_1.0.4        pkgconfig_2.0.3        Matrix_1.7-0          
##  [28] R6_2.5.1               fastmap_1.2.0          fitdistrplus_1.2-1    
##  [31] future_1.34.0          shiny_1.9.1            digest_0.6.37         
##  [34] colorspace_2.1-1       rprojroot_2.0.4        tensor_1.5            
##  [37] RSpectra_0.16-2        irlba_2.3.5.1          labeling_0.4.3        
##  [40] progressr_0.14.0       fansi_1.0.6            spatstat.sparse_3.1-0 
##  [43] timechange_0.3.0       httr_1.4.7             polyclip_1.10-7       
##  [46] abind_1.4-5            compiler_4.4.0         bit64_4.0.5           
##  [49] withr_3.0.1            fastDummies_1.7.4      highr_0.11            
##  [52] MASS_7.3-61            tools_4.4.0            lmtest_0.9-40         
##  [55] httpuv_1.6.15          future.apply_1.11.2    goftest_1.2-3         
##  [58] glue_1.7.0             nlme_3.1-166           promises_1.3.0        
##  [61] grid_4.4.0             Rtsne_0.17             cluster_2.1.6         
##  [64] reshape2_1.4.4         generics_0.1.3         gtable_0.3.5          
##  [67] spatstat.data_3.1-2    tzdb_0.4.0             data.table_1.16.0     
##  [70] hms_1.1.3              utf8_1.2.4             spatstat.geom_3.3-2   
##  [73] RcppAnnoy_0.0.22       ggrepel_0.9.5          RANN_2.6.2            
##  [76] pillar_1.9.0           vroom_1.6.5            spam_2.10-0           
##  [79] RcppHNSW_0.6.0         later_1.3.2            splines_4.4.0         
##  [82] lattice_0.22-6         bit_4.0.5              survival_3.7-0        
##  [85] deldir_2.0-4           tidyselect_1.2.1       miniUI_0.1.1.1        
##  [88] pbapply_1.7-2          knitr_1.48             gridExtra_2.3         
##  [91] scattermore_1.2        xfun_0.47              matrixStats_1.3.0     
##  [94] stringi_1.8.4          lazyeval_0.2.2         yaml_2.3.10           
##  [97] evaluate_0.24.0        codetools_0.2-20       cli_3.6.3             
## [100] uwot_0.2.2             xtable_1.8-4           reticulate_1.38.0     
## [103] munsell_0.5.1          jquerylib_0.1.4        Rcpp_1.0.13           
## [106] globals_0.16.3         spatstat.random_3.3-1  png_0.1-8             
## [109] spatstat.univar_3.0-0  parallel_4.4.0         dotCall64_1.1-1       
## [112] listenv_0.9.1          viridisLite_0.4.2      scales_1.3.0          
## [115] ggridges_0.5.6         crayon_1.5.3           leiden_0.4.3.1        
## [118] rlang_1.1.4            cowplot_1.1.3